Olivia’s Cleaning

Goals of this notebook

The steps we’ll take to prepare our data:

  • Download the data
  • Import it into our notebook
  • Clean up data types and columns
  • Export the data for next notebook

Setup

library(tidyverse)
library(janitor)

Importing data

We’re importing the data which was collected in a PIR by KUT’s Lauren McGaughy.

spurs_raw <- read_csv("data-raw/spurs_raw_data.csv") |> clean_names()
Rows: 50162 Columns: 17
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (15): Arrest Date, Arrest County, Arrest Division, Arrest State, Arrest ...
dbl  (2): Arrest ID, Person Age

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
spurs_raw

Changing name

spurs_renamed <- spurs_raw |> 
  rename(arrest_date_old = arrest_date,
         charge = offense,
         severity_code = level_group,
         arrest_officer = arrested_by,
         gender_code = person_sex,
         statute = citation,
         ethnicity_race = person_race
         )

spurs_renamed

Changing dates

spurs_date <- spurs_renamed |>
  mutate(
    charge_date = mdy_hm(arrest_date_old) |> date()
  )

Changing officer column

spurs_officer_cleaning <- spurs_date |> 
  mutate(new_officer = str_split_i(arrest_officer, " - ", 1), .after = arrest_officer,
         id_officer = str_split_i(arrest_officer, " - ", 2)
         )

spurs_officer_cleaning

Changing ethnicity column

spurs_race <- spurs_officer_cleaning |> 
  mutate(
    person_race_abbr = case_match(ethnicity_race, "White" ~ "W", "Hispanic" ~ "H", "Hispanic or Latino" ~ "H", "Black" ~ "B", "Asian" ~ "A"),
    person_gender_abbr = case_match(gender_code, "M - Male" ~ "M", "F - Female" ~ "F")
    )

spurs_race

Changing names to uppercase

spurs_uppercase <- spurs_race |> 
  mutate(person_first_name = str_to_upper(person_first_name),
         person_last_name = str_to_upper(person_last_name))

spurs_uppercase

Creating a charge count

spurs_charge_count <- spurs_uppercase |>
  group_by(arrest_date_old, arrest_county, arrest_division, arrest_id, arrest_state, arrest_urn, arrest_officer, new_officer, id_officer, statute, code, inv_number, severity_code, charge, person_first_name, person_last_name, ethnicity_race, gender_code, person_age, charge_date, person_race_abbr, person_gender_abbr ) |> 
  summarise(charge_count = n()) |> 
  mutate(charge_count = as.numeric(charge_count))
`summarise()` has grouped output by 'arrest_date_old', 'arrest_county',
'arrest_division', 'arrest_id', 'arrest_state', 'arrest_urn', 'arrest_officer',
'new_officer', 'id_officer', 'statute', 'code', 'inv_number', 'severity_code',
'charge', 'person_first_name', 'person_last_name', 'ethnicity_race',
'gender_code', 'person_age', 'charge_date', 'person_race_abbr'. You can
override using the `.groups` argument.
spurs_charge_count

Severity code

spurs_code_clean <- spurs_charge_count |> 
  mutate(severity_code_clean = str_remove(severity_code, "SPURS "), .after = severity_code)

spurs_code_clean

Selecting which columns I actually need

spurs_clean <- spurs_code_clean |> ungroup() |> 
  select(
    arrest_county,
    arrest_division,
    arrest_id,
    arrest_state,
    arrest_urn,
    arresting_officer = new_officer,
    spurs_officer_id = id_officer,
    statute,
    code,
    inv_number,
    severity_code_clean,
    charge,
    person_first_name,
    person_last_name,
    person_age,
    charge_date,
    person_race_abbr,
    person_gender_abbr,
    charge_count
  )

spurs_clean |> glimpse()
Rows: 41,379
Columns: 19
$ arrest_county       <chr> "Aransas", "Bee", "Bee", "Cameron", "Cameron", "Ec…
$ arrest_division     <chr> "Texas Highway Patrol", "Texas Highway Patrol", "T…
$ arrest_id           <dbl> 209274810, 210182331, 223969513, 213772403, 213772…
$ arrest_state        <chr> "TEXAS", "TEXAS", "TEXAS", "TEXAS", "TEXAS", "TEXA…
$ arrest_urn          <chr> "CMAR00171811", "CMAR00172979", "CMAR00183927", "C…
$ arresting_officer   <chr> "Kaitlynn King", "Luke William Kanz", "Justin K Ja…
$ spurs_officer_id    <chr> "Kk26997", "Lk23525", "Jj25825", "Jc05699", "Jc056…
$ statute             <chr> "46.02(A)", "481.121(B)(1)", "46.02(A)", "481.121(…
$ code                <chr> "Penal Code", "Health And Safety Code", "Penal Cod…
$ inv_number          <chr> "2022I-THP3-149480", "2022I-THP3-149608", "2022I-T…
$ severity_code_clean <chr> "Misdemeanor Charges", "Misdemeanor Charges", "Mis…
$ charge              <chr> "Unl Carrying Weapon", "Possession Of Marihuana <2…
$ person_first_name   <chr> "WESLIN", "JOEL", "JESUS", "SERGIO", "SERGIO", "KI…
$ person_last_name    <chr> "LEMKE", "CASAS", "CADENA", "ARMENDAREZ-GARCA", "A…
$ person_age          <dbl> 23, 27, 52, 23, 23, 29, 29, 45, 29, 17, 24, 24, 24…
$ charge_date         <date> 2022-01-01, 2022-01-01, 2022-01-01, 2022-01-01, 2…
$ person_race_abbr    <chr> "W", "H", "H", "H", "H", "H", "H", "H", "H", "H", …
$ person_gender_abbr  <chr> "M", "M", "M", "M", "M", "F", "F", "M", "M", "M", …
$ charge_count        <dbl> 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1,…

Exporting to data_processed

spurs_clean |>
write_rds("data-processed/01-clean-spurs.rds")